In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this if the data visualisations doesn't work
%matplotlib inline

plt.style.use('bmh')
df = pd.read_csv("/Users/Prosenjeet Saha/Desktop/Data/Gold Price Data.csv")
df.head()
Out[1]:
Date Open High Low Close WAP No. of Shares No. of Trades Total Turnover Deliverable Quantity % Deli. Qty to Traded Qty Spread H-L Spread C-O
0 2017-02-06 0.79 0.79 0.76 0.76 0.79 7430.0 7.0 5848.0 7430.0 100.0 0.03 -0.03
1 2017-02-03 0.79 0.79 0.79 0.79 0.79 310.0 4.0 244.0 310.0 100.0 0.00 0.00
2 2017-02-02 0.83 0.83 0.83 0.83 0.83 75.0 1.0 62.0 75.0 100.0 0.00 0.00
3 2017-01-31 0.87 0.87 0.87 0.87 0.87 1050.0 2.0 913.0 1050.0 100.0 0.00 0.00
4 2017-01-25 0.91 0.91 0.91 0.91 0.91 400.0 1.0 364.0 400.0 100.0 0.00 0.00
In [2]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1660 entries, 0 to 1659
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Date                       1660 non-null   object 
 1   Open                       1660 non-null   float64
 2   High                       1660 non-null   float64
 3   Low                        1660 non-null   float64
 4   Close                      1660 non-null   float64
 5   WAP                        1660 non-null   float64
 6   No. of Shares              1660 non-null   float64
 7   No. of Trades              1660 non-null   float64
 8   Total Turnover             1660 non-null   float64
 9   Deliverable Quantity       1660 non-null   float64
 10  % Deli. Qty to Traded Qty  1660 non-null   float64
 11  Spread H-L                 1660 non-null   float64
 12  Spread C-O                 1660 non-null   float64
dtypes: float64(12), object(1)
memory usage: 168.7+ KB
In [3]:
df.describe()
plt.figure(figsize=(9, 8))
sns.distplot(df.describe(), color='g', bins=100, hist_kws={'alpha': 0.4});
C:\Users\Prosenjeet Saha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [4]:
#Numerical data distribution
list(set(df.dtypes.tolist()))
Out[4]:
[dtype('O'), dtype('float64')]
In [5]:
df_num = df.select_dtypes(include = ['float64', 'int64'])
df_num.head()
Out[5]:
Open High Low Close WAP No. of Shares No. of Trades Total Turnover Deliverable Quantity % Deli. Qty to Traded Qty Spread H-L Spread C-O
0 0.79 0.79 0.76 0.76 0.79 7430.0 7.0 5848.0 7430.0 100.0 0.03 -0.03
1 0.79 0.79 0.79 0.79 0.79 310.0 4.0 244.0 310.0 100.0 0.00 0.00
2 0.83 0.83 0.83 0.83 0.83 75.0 1.0 62.0 75.0 100.0 0.00 0.00
3 0.87 0.87 0.87 0.87 0.87 1050.0 2.0 913.0 1050.0 100.0 0.00 0.00
4 0.91 0.91 0.91 0.91 0.91 400.0 1.0 364.0 400.0 100.0 0.00 0.00
In [6]:
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8); # ; avoid having the matplotlib verbose informations
In [7]:
#checking the data types
df.dtypes
Out[7]:
Date                          object
Open                         float64
High                         float64
Low                          float64
Close                        float64
WAP                          float64
No. of Shares                float64
No. of Trades                float64
Total Turnover               float64
Deliverable Quantity         float64
% Deli. Qty to Traded Qty    float64
Spread H-L                   float64
Spread C-O                   float64
dtype: object
In [8]:
#Dropping the duplicate rows
df.shape
Out[8]:
(1660, 13)
In [9]:
duplicate_rows_df = df[df.duplicated()]
print("number of duplicate rows: ", duplicate_rows_df.shape)
number of duplicate rows:  (0, 13)
In [10]:
#Now let us remove the duplicate data because it's ok to remove them
df.count()      # Used to count the number of rows
Out[10]:
Date                         1660
Open                         1660
High                         1660
Low                          1660
Close                        1660
WAP                          1660
No. of Shares                1660
No. of Trades                1660
Total Turnover               1660
Deliverable Quantity         1660
% Deli. Qty to Traded Qty    1660
Spread H-L                   1660
Spread C-O                   1660
dtype: int64
In [11]:
df = df.drop_duplicates()
df.head(5)
Out[11]:
Date Open High Low Close WAP No. of Shares No. of Trades Total Turnover Deliverable Quantity % Deli. Qty to Traded Qty Spread H-L Spread C-O
0 2017-02-06 0.79 0.79 0.76 0.76 0.79 7430.0 7.0 5848.0 7430.0 100.0 0.03 -0.03
1 2017-02-03 0.79 0.79 0.79 0.79 0.79 310.0 4.0 244.0 310.0 100.0 0.00 0.00
2 2017-02-02 0.83 0.83 0.83 0.83 0.83 75.0 1.0 62.0 75.0 100.0 0.00 0.00
3 2017-01-31 0.87 0.87 0.87 0.87 0.87 1050.0 2.0 913.0 1050.0 100.0 0.00 0.00
4 2017-01-25 0.91 0.91 0.91 0.91 0.91 400.0 1.0 364.0 400.0 100.0 0.00 0.00
In [12]:
df.count()
Out[12]:
Date                         1660
Open                         1660
High                         1660
Low                          1660
Close                        1660
WAP                          1660
No. of Shares                1660
No. of Trades                1660
Total Turnover               1660
Deliverable Quantity         1660
% Deli. Qty to Traded Qty    1660
Spread H-L                   1660
Spread C-O                   1660
dtype: int64
In [13]:
#Dropping the missing or null values
print(df.isnull().sum())
Date                         0
Open                         0
High                         0
Low                          0
Close                        0
WAP                          0
No. of Shares                0
No. of Trades                0
Total Turnover               0
Deliverable Quantity         0
% Deli. Qty to Traded Qty    0
Spread H-L                   0
Spread C-O                   0
dtype: int64
In [14]:
#Detecting Outliers
sns.boxplot(x=df['Open'])
Out[14]:
<Axes: xlabel='Open'>
In [15]:
sns.boxplot(x=df['High'])
Out[15]:
<Axes: xlabel='High'>
In [16]:
sns.boxplot(x=df['Low'])
Out[16]:
<Axes: xlabel='Low'>
In [17]:
sns.boxplot(x=df['Close'])
Out[17]:
<Axes: xlabel='Close'>
In [18]:
sns.boxplot(x=df['No. of Shares'])
Out[18]:
<Axes: xlabel='No. of Shares'>
In [19]:
sns.boxplot(x=df['No. of Trades'])
Out[19]:
<Axes: xlabel='No. of Trades'>
In [20]:
sns.boxplot(x=df['Total Turnover'])
Out[20]:
<Axes: xlabel='Total Turnover'>
In [21]:
sns.boxplot(x=df['Deliverable Quantity'])
Out[21]:
<Axes: xlabel='Deliverable Quantity'>
In [22]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
print(IQR)
Open                         1.004250e+01
High                         1.056000e+01
Low                          9.930000e+00
Close                        1.002750e+01
WAP                          1.011550e+01
No. of Shares                1.490235e+05
No. of Trades                1.712500e+02
Total Turnover               2.050616e+06
Deliverable Quantity         1.184230e+05
% Deli. Qty to Traded Qty    2.106750e+01
Spread H-L                   8.500000e-01
Spread C-O                   1.800000e-01
dtype: float64
In [23]:
df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
df.shape
C:\Users\Prosenjeet Saha\AppData\Local\Temp\ipykernel_12144\4147643536.py:1: FutureWarning: Automatic reindexing on DataFrame vs Series comparisons is deprecated and will raise ValueError in a future version. Do `left, right = left.align(right, axis=1, copy=False)` before e.g. `left == right`
  df = df[~((df < (Q1 - 1.5 * IQR)) |(df > (Q3 + 1.5 * IQR))).any(axis=1)]
Out[23]:
(1132, 13)
In [24]:
#Plot different features against one another (scatter), against frequency (histogram)
df.hist()
Out[24]:
array([[<Axes: title={'center': 'Open'}>,
        <Axes: title={'center': 'High'}>,
        <Axes: title={'center': 'Low'}>],
       [<Axes: title={'center': 'Close'}>,
        <Axes: title={'center': 'WAP'}>,
        <Axes: title={'center': 'No. of Shares'}>],
       [<Axes: title={'center': 'No. of Trades'}>,
        <Axes: title={'center': 'Total Turnover'}>,
        <Axes: title={'center': 'Deliverable Quantity'}>],
       [<Axes: title={'center': '% Deli. Qty to Traded Qty'}>,
        <Axes: title={'center': 'Spread H-L'}>,
        <Axes: title={'center': 'Spread C-O'}>]], dtype=object)
In [25]:
#plotting Heat Maps
plt.figure(figsize=(10,5))
c= df.corr()
sns.heatmap(c,cmap="BrBG",annot=True)
c
Out[25]:
Open High Low Close WAP No. of Shares No. of Trades Total Turnover Deliverable Quantity % Deli. Qty to Traded Qty Spread H-L Spread C-O
Open 1.000000 0.999359 0.999108 0.999692 0.999519 0.424709 0.235035 0.588435 0.433811 -0.339378 0.803052 -0.169538
High 0.999359 1.000000 0.999021 0.999413 0.999460 0.420926 0.235375 0.585213 0.430000 -0.338162 0.813143 -0.155049
Low 0.999108 0.999021 1.000000 0.999322 0.999504 0.410888 0.224782 0.575277 0.420581 -0.328516 0.786593 -0.148677
Close 0.999692 0.999413 0.999322 1.000000 0.999716 0.418452 0.231795 0.582777 0.428262 -0.333555 0.800995 -0.145009
WAP 0.999519 0.999460 0.999504 0.999716 1.000000 0.417035 0.230715 0.581522 0.426852 -0.332976 0.799259 -0.149420
No. of Shares 0.424709 0.420926 0.410888 0.418452 0.417035 1.000000 0.602534 0.921316 0.974295 -0.697808 0.468916 -0.315107
No. of Trades 0.235035 0.235375 0.224782 0.231795 0.230715 0.602534 1.000000 0.541898 0.574102 -0.528441 0.327714 -0.165539
Total Turnover 0.588435 0.585213 0.575277 0.582777 0.581522 0.921316 0.541898 1.000000 0.906034 -0.638952 0.599039 -0.317087
Deliverable Quantity 0.433811 0.430000 0.420581 0.428262 0.426852 0.974295 0.574102 0.906034 1.000000 -0.568492 0.468032 -0.288441
% Deli. Qty to Traded Qty -0.339378 -0.338162 -0.328516 -0.333555 -0.332976 -0.697808 -0.528441 -0.638952 -0.568492 1.000000 -0.397517 0.284462
Spread H-L 0.803052 0.813143 0.786593 0.800995 0.799259 0.468916 0.327714 0.599039 0.468032 -0.397517 1.000000 -0.207906
Spread C-O -0.169538 -0.155049 -0.148677 -0.145009 -0.149420 -0.315107 -0.165539 -0.317087 -0.288441 0.284462 -0.207906 1.000000
In [26]:
#Scatterplot
#We generally use scatter plots to find the correlation between two variables. 
#Here the scatter plots are plotted between Horsepower and Price and 
#we can see the plot below. With the plot given below, 
#we can easily draw a trend line. These features provide a good scattering of points.
fig, ax = plt.subplots(figsize=(10,6))
ax.scatter(df['Open'], df['High'])
ax.set_xlabel('Open')
ax.set_ylabel('High')
plt.show()
In [28]:
import missingno as msno

msno.matrix(df, labels=True, sort="descending");
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Local\Temp\ipykernel_12144\3630483539.py in <module>
      1 import missingno as msno
      2 
----> 3 msno.matrix(df, labels=True, sort="descending");

~\anaconda3\lib\site-packages\missingno\missingno.py in matrix(df, filter, n, p, sort, figsize, width_ratios, color, fontsize, labels, label_rotation, sparkline, freq, ax)
     71     # Remove extraneous default visual elements.
     72     ax0.set_aspect('auto')
---> 73     ax0.grid(b=False)
     74     ax0.xaxis.tick_top()
     75     ax0.xaxis.set_ticks_position('none')

~\anaconda3\lib\site-packages\matplotlib\axes\_base.py in grid(self, visible, which, axis, **kwargs)
   3194         _api.check_in_list(['x', 'y', 'both'], axis=axis)
   3195         if axis in ['x', 'both']:
-> 3196             self.xaxis.grid(visible, which=which, **kwargs)
   3197         if axis in ['y', 'both']:
   3198             self.yaxis.grid(visible, which=which, **kwargs)

~\anaconda3\lib\site-packages\matplotlib\axis.py in grid(self, visible, which, **kwargs)
   1653             gridkw['gridOn'] = (not self._major_tick_kw['gridOn']
   1654                                 if visible is None else visible)
-> 1655             self.set_tick_params(which='major', **gridkw)
   1656         self.stale = True
   1657 

~\anaconda3\lib\site-packages\matplotlib\axis.py in set_tick_params(self, which, reset, **kwargs)
    925         """
    926         _api.check_in_list(['major', 'minor', 'both'], which=which)
--> 927         kwtrans = self._translate_tick_params(kwargs)
    928 
    929         # the kwargs are stored in self._major/minor_tick_kw so that any

~\anaconda3\lib\site-packages\matplotlib\axis.py in _translate_tick_params(kw, reverse)
   1069         for key in kw_:
   1070             if key not in allowed_keys:
-> 1071                 raise ValueError(
   1072                     "keyword %s is not recognized; valid keywords are %s"
   1073                     % (key, allowed_keys))

ValueError: keyword grid_b is not recognized; valid keywords are ['size', 'width', 'color', 'tickdir', 'pad', 'labelsize', 'labelcolor', 'zorder', 'gridOn', 'tick1On', 'tick2On', 'label1On', 'label2On', 'length', 'direction', 'left', 'bottom', 'right', 'top', 'labelleft', 'labelbottom', 'labelright', 'labeltop', 'labelrotation', 'grid_agg_filter', 'grid_alpha', 'grid_animated', 'grid_antialiased', 'grid_clip_box', 'grid_clip_on', 'grid_clip_path', 'grid_color', 'grid_dash_capstyle', 'grid_dash_joinstyle', 'grid_dashes', 'grid_data', 'grid_drawstyle', 'grid_figure', 'grid_fillstyle', 'grid_gapcolor', 'grid_gid', 'grid_in_layout', 'grid_label', 'grid_linestyle', 'grid_linewidth', 'grid_marker', 'grid_markeredgecolor', 'grid_markeredgewidth', 'grid_markerfacecolor', 'grid_markerfacecoloralt', 'grid_markersize', 'grid_markevery', 'grid_mouseover', 'grid_path_effects', 'grid_picker', 'grid_pickradius', 'grid_rasterized', 'grid_sketch_params', 'grid_snap', 'grid_solid_capstyle', 'grid_solid_joinstyle', 'grid_transform', 'grid_url', 'grid_visible', 'grid_xdata', 'grid_ydata', 'grid_zorder', 'grid_aa', 'grid_c', 'grid_ds', 'grid_ls', 'grid_lw', 'grid_mec', 'grid_mew', 'grid_mfc', 'grid_mfcalt', 'grid_ms']
In [29]:
df.plot(lw=0, marker=".", subplots=True, layout=(-1, 4),
          figsize=(15, 30), markersize=1);
In [30]:
# Extract descriptive properties of non-numerical features
df.describe(exclude=["number", "datetime"])
Out[30]:
Date
count 1132
unique 1132
top 2017-02-06
freq 1
In [31]:
# Create figure object with 3 subplots
fig, axes = plt.subplots(ncols=1, nrows=3, figsize=(12, 8))

# Identify non-numerical features
df_non_numerical = df.select_dtypes(exclude=["number", "datetime"])

# Loop through features and put each subplot on a matplotlib axis object
for col, ax in zip(df_non_numerical.columns, axes.ravel()):

    # Selects one single feature and counts number of occurrences per unique value
    df_non_numerical[col].value_counts().plot(

        # Plots this information in a figure with log-scaled y-axis
        logy=True, title=col, lw=0, marker=".", ax=ax)
    
plt.tight_layout();
In [32]:
# Plots the histogram for each numerical feature in a separate subplot
df.hist(bins=25, figsize=(15, 25), layout=(-1, 5), edgecolor="black")
plt.tight_layout();
In [33]:
# Collects for each feature the most frequent entry
most_frequent_entry = df.mode()

# Checks for each entry if it contains the most frequent entry
df_freq = df.eq(most_frequent_entry.values, axis=1)

# Computes the mean of the 'is_most_frequent' occurrence
df_freq = df_freq.mean().sort_values(ascending=False)

# Show the 5 top features with the highest ratio of singular value content
display(df_freq.head())

# Visualize the 'df_freq' table
df_freq.plot.bar(figsize=(15, 4));
% Deli. Qty to Traded Qty    0.000883
Date                         0.000000
Open                         0.000000
High                         0.000000
Low                          0.000000
dtype: float64
In [34]:
# Creates mask to identify numerical features with more or less than 25 unique features
cols_continuous = df.select_dtypes(include="number").nunique() >= 25
In [35]:
# Create a new dataframe which only contains the continuous features
df_continuous = df[cols_continuous[cols_continuous].index]
df_continuous.shape
Out[35]:
(1132, 12)
In [36]:
import seaborn as sns

sns.pairplot(df_continuous, height=1.5, plot_kws={"s": 2, "alpha": 0.2});
In [37]:
# Create a new dataframe which doesn't contain the numerical continuous features
df_discrete = df[cols_continuous[~cols_continuous].index]
df_discrete.shape
Out[37]:
(1132, 0)
In [38]:
# Computes feature correlation
df_corr = df.corr(method="pearson")
In [41]:
# Create labels for the correlation matrix
import numpy as np
labels = np.where(np.abs(df_corr)>0.75, "S",
                  np.where(np.abs(df_corr)>0.5, "M",
                           np.where(np.abs(df_corr)>0.25, "W", "")))

# Plot correlation matrix
plt.figure(figsize=(15, 15))
sns.heatmap(df_corr, mask=np.eye(len(df_corr)), square=True,
            center=0, annot=labels, fmt='', linewidths=.5,
            cmap="vlag", cbar_kws={"shrink": 0.8});
In [42]:
#  Creates a mask to remove the diagonal and the upper triangle.
lower_triangle_mask = np.tril(np.ones(df_corr.shape), k=-1).astype("bool")

#  Stack all correlations, after applying the mask
df_corr_stacked = df_corr.where(lower_triangle_mask).stack().sort_values()

#  Showing the lowest and highest correlations in the correlation matrix
display(df_corr_stacked)
% Deli. Qty to Traded Qty  No. of Shares               -0.697808
                           Total Turnover              -0.638952
                           Deliverable Quantity        -0.568492
                           No. of Trades               -0.528441
Spread H-L                 % Deli. Qty to Traded Qty   -0.397517
                                                          ...   
WAP                        High                         0.999460
                           Low                          0.999504
                           Open                         0.999519
Close                      Open                         0.999692
WAP                        Close                        0.999716
Length: 66, dtype: float64
In [43]:
#correlation matrix
corrmat = df.corr()
f, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corrmat, vmax=.8, square=True);
In [44]:
#Bivariate Analysis
import seaborn as sns
sns.pairplot(df)
plt.show()
In [45]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(rc={'figure.figsize':(16, 4)})
plt.rcParams['figure.dpi'] = 150
In [46]:
df['High'].plot(linewidth=0.4)
Out[46]:
<Axes: >
In [47]:
df['Low'].plot(linewidth=0.4)
Out[47]:
<Axes: >
In [48]:
df.plot(linewidth=0.4)
Out[48]:
<Axes: >
In [49]:
df['Open'].plot(linewidth=0.4)
Out[49]:
<Axes: >
In [50]:
df['Close'].plot(linewidth=0.4)
Out[50]:
<Axes: >
In [51]:
cols_to_plot = ['Open', 'High', 'Low']
axes = df[cols_to_plot].plot(marker='.', alpha=0.5, linestyle='None',figsize=(14, 7), subplots=True)
for ax in axes:
    ax.set_ylabel('Daily Totals Stock')
In [52]:
fig, axes = plt.subplots(3, 1, figsize=(8, 7), sharex=True)
for name, ax in zip(['High', 'Open', 'Low'], axes):
  sns.boxplot(data=df, x='Date', y=name, ax=ax)
  ax.set_ylabel('Stock')
  ax.set_title(name)
  if ax != axes[-1]:
    ax.set_xlabel('')
In [53]:
import networkx as nx
In [54]:
df.skew().sort_values(ascending=False)
C:\Users\Prosenjeet Saha\AppData\Local\Temp\ipykernel_12144\4024944668.py:1: FutureWarning: Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.
  df.skew().sort_values(ascending=False)
Out[54]:
Spread H-L                   2.448261
Total Turnover               2.253303
Low                          2.103139
WAP                          2.059310
High                         2.055368
Close                        2.049954
Open                         2.035239
No. of Shares                1.880386
Deliverable Quantity         1.744694
No. of Trades                1.614688
Spread C-O                  -0.764600
% Deli. Qty to Traded Qty   -2.123731
dtype: float64
In [55]:
#Log Transform
Low_log=np.log(df['Low'])
Low_log.skew()
Out[55]:
0.6786644822512088
In [56]:
#Square Root Transform
Low_sqrt=np.sqrt(df['Low'])
Low_sqrt.skew()
Out[56]:
1.2837203476217665
In [57]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd 
from sklearn.preprocessing import PowerTransformer, QuantileTransformer
cols1 = ["Low", "High", "Close"]
def test_transformers(columns):
    pt = PowerTransformer()
    qt = QuantileTransformer(n_quantiles=500, output_distribution='normal')
    fig = plt.figure(figsize=(20,30))
    j = 1
    for i in columns:
        array = np.array(df[i]).reshape(-1, 1)
        y = pt.fit_transform(array)
        x = qt.fit_transform(array)
        plt.subplot(3,3,j)
        sns.histplot(array, bins = 50, kde = True)
        plt.title(f"Original Distribution for {i}")
        plt.subplot(3,3,j+1)
        sns.histplot(x, bins = 50, kde = True)
        plt.title(f"Quantile Transform for {i}")
        plt.subplot(3,3,j+2)
        sns.histplot(y, bins = 50, kde = True)
        plt.title(f"Power Transform for {i}")
        j += 3
test_transformers(cols1)
In [58]:
from matplotlib import pyplot
df.plot()
pyplot.show()
In [67]:
import pandas as pd  
import matplotlib.pyplot as plt  
import statsmodels.api as sm  
from statsmodels.formula.api import ols  
import seaborn as sns  
import numpy as np  
import pandas.tseries  
plt.style.use('fivethirtyeight')
In [71]:
#Understanding the distribution of Weight
f, ax = plt.subplots( figsize = (11,9) )  
plt.title( 'Weight Distributions among Sample' )  
plt.ylabel( 'Stock' )  
sns.distplot( df.Low )  
plt.show()
C:\Users\Prosenjeet Saha\anaconda3\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
  warnings.warn(msg, FutureWarning)
In [ ]: